library(tidyverse); theme_set(theme_bw())
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggpubr)
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:ggpubr':
##
## get_legend
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggbeeswarm)
library(fs)
# Get input files
kmer_QC_file <- "/labs/kingsley/ambenj/myosin_dups/analysis/ase/kmer_analysis/MYH_BEPA_RABS_27mers_QC.txt"
m_file <- "/labs/kingsley/ambenj/myosin_dups/analysis/ase/metadata.txt"
seq_stats_file <- "/labs/kingsley/ambenj/myosin_dups/raw/RNAseq/30-1151453543/sequencing_stats.csv"
kinnex_counts_file <- "/labs/kingsley/ambenj/myosin_dups/analysis/ase/flnc_mapping/align_RABS-3copy_BEPA-5copy/exons3-35_counts/exon3-35_counts_all.txt"
# Check a reads detected by multiple kmers are assigned to the same myosin copy and allele
r <- read_tsv("/labs/kingsley/ambenj/myosin_dups/analysis/ase/kmer_analysis/kmer_counts/9_matching_reads.txt", col_names = c("kmer", "read")) %>%
group_by(read) %>%
summarize(n = n(),
kmers = paste(sort(unique(kmer)),collapse=", ")) %>%
filter(n>1) %>%
mutate(C1 = ifelse(str_detect(kmers, "C1"), 1, 0),
C2 = ifelse(str_detect(kmers, "C2"), 1, 0),
C3 = ifelse(str_detect(kmers, "C3"), 1, 0),
BEPA = ifelse(str_detect(kmers, "BEPA"), 1, 0),
RABS = ifelse(str_detect(kmers, "RABS"), 1, 0),
MYH_copies_detect = C1 + C2 + C3,
alleles_detected = BEPA + RABS)
## Rows: 227291 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): kmer, read
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count number of reads assigned to more than one MYH copy
r %>%
filter(MYH_copies_detect > 1)
# Are there any kmers assigned to more than one MYH copy that are more common than others?
r %>%
filter(MYH_copies_detect > 1) %>%
group_by(kmers) %>%
summarise(n=n()) %>%
arrange(-n)
# Count number of reads detected by both BEPA and RABS alleles
r %>%
filter(alleles_detected > 1, !str_detect(kmers, "C3dup"))
# Read metadata file
m <- read_tsv(m_file) %>%
mutate(Tissue_type = case_when(Tissue_type == "Whole larvae" ~ "Whole juvenile",
Tissue_type == "Flank muscle" ~ "Adult flank",
Tissue_type == "Abductor muscle" ~ "Adult abductor",
Tissue_type == "Adductor muscle" ~ "Adult adductor",
TRUE ~ Tissue_type),
Tissue_type = factor(Tissue_type, levels=c("Whole juvenile", "Adult flank", "Adult abductor", "Adult adductor")))
## Rows: 45 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (10): ID, Fish, Sample1, Sample2, Type, Age, Temperature, Tissue_type, C...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
m
# Read sequencing stats file
s <- read_csv(seq_stats_file)
## Rows: 45 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Project, Sample ID, Barcode Sequence
## dbl (4): # Reads, Yield (Mbases), Mean Quality Score, % Bases >= 30
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
s
# Read kmer information
k <- read_tsv(kmer_QC_file)
## Rows: 102 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): kmer_name, BLAT_QC_BEPA, BLAT_QC_RABS, Isoseq_QC, Sufficient cover...
## dbl (5): length, min, max, min_original, max_original
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Get list of kmer files per sample
kmer_files <- dir_ls(path = "/labs/kingsley/ambenj/myosin_dups/analysis/ase/kmer_analysis/kmer_counts", glob = "*_kmers_with_counts.tsv")
# function to add file name to dataframe
read_and_record_filename <- function(filename){
read_tsv(filename) %>%
mutate(filename = path_file(filename))
}
# gather kmer files from all samples into one dataframe
kmer_counts <- map_df(kmer_files, read_and_record_filename)%>%
separate(filename, into = c("sample"), sep = "_") %>%
left_join(.,k, by=c("kmer_name"="kmer_name")) %>%
mutate(group = case_when(str_detect(kmer_name, "C3dup") ~ "C3dup_ASE",
str_detect(kmer_name, "ASE") ~ "ASE",
str_detect(kmer_name, "MCE") ~ "MCE"))
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 1 pieces. Additional pieces discarded in 4590 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
kmer_counts
Allele-specific expression analysis
# Read kmer counts file
df_ase <- kmer_counts %>%
filter(group == "ASE", use=="yes", spans_intron == "no") %>%
separate(kmer_name, into=c("source", "ASE"), remove=FALSE, extra="drop", sep="_") %>%
unite(kmer_pair, source, ASE, sep = "_", remove = FALSE) %>%
select(source, kmer_pair, use, exon, spans_intron, sample, allele, match_count, min) %>%
pivot_wider(id_cols = c(kmer_pair, use, sample, source, exon, spans_intron, min), names_from = allele, values_from = match_count) %>%
mutate(total_counts = BEPA + RABS,
include = case_when(BEPA >=1 & RABS >= 1 & total_counts >= 100 ~ "yes",
TRUE ~ "no"),
# BEPA = case_when(BEPA==0 ~ 1, # Add minimum of one read to counts so log scores wont be zero
# TRUE ~ BEPA),
# RABS = case_when(RABS==0 ~ 1,
# TRUE ~ RABS),
BEPA_RABS_ratio = BEPA / RABS,
BEPA_RABS_log2_ratio = log2(BEPA/RABS),
prop_BEPA = BEPA/(BEPA+RABS)) %>%
left_join(., m, by=c("sample" = "ID")) %>%
left_join(., s, by=c("sample" = "Sample ID")) %>%
mutate(RPM = (total_counts/ `# Reads`)*1000000) # Table contains read pairs
df_ase
df_ase %>%
group_by(kmer_pair) %>%
summarize(n=n()) %>%
arrange()
df_ase %>%
filter(kmer_pair == "C1_ASE2") %>%
group_by(Tissue_type, Temperature) %>%
summarize(n=n())
## `summarise()` has grouped output by 'Tissue_type'. You can override using the
## `.groups` argument.
# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results <- df_ase %>%
group_by(kmer_pair, Temperature, source, Tissue_type) %>%
filter(include == "yes") %>%
summarize(
n = sum(!is.na(BEPA_RABS_log2_ratio)),
p_value = if (n >= 3) wilcox.test(prop_BEPA, mu = 0.5)$p.value else NA_real_,
# p_value = if (n >= 3) wilcox.test(BEPA_RABS_log2_ratio, mu = 0, alternative = "greater")$p.value else NA_real_,
.groups = "drop"
) %>%
mutate(
significance = case_when(
is.na(p_value) ~ NA_character_,
p_value <= 0.001 ~ "***",
p_value <= 0.01 ~ "**",
p_value <= 0.05 ~ "*",
TRUE ~ "ns"
)
)
# Add y-label positions for plotting significance results
label_positions <- df_ase %>%
filter(include == "yes") %>%
group_by(kmer_pair,Temperature, source, Tissue_type) %>%
summarize(y_pos = max(prop_BEPA, na.rm = TRUE) + 0.05, .groups = "drop")
test_results <- left_join(test_results, label_positions,
by = c("kmer_pair", "Temperature", "source", "Tissue_type"))
# Plot with significance values
ASE_plot <- df_ase %>%
filter(include == "yes") %>%
ggplot(aes(reorder(kmer_pair, min), prop_BEPA, color = Temperature)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=1, dodge.width = .7, alpha = 0.7, size=2) +
scale_color_manual(values = c("black", "grey60")) +
facet_grid(cols = vars(source), rows = vars(Tissue_type), scales = "free_x", space = "free") +
geom_text(
data = test_results %>% filter(!is.na(significance)),
aes(x = kmer_pair, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust=0.75) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylim(0.2, 0.9)
ASE_plot

# Save ASE figure to separate file
ggsave(
"/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/ASE_plot_twosided.pdf",
plot = ASE_plot,
scale = 1,
width = 12,
height = 10,
units = c("in"),
dpi = 300,
)
# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results <- df_ase %>%
group_by(kmer_pair, Temperature, source, Tissue_type) %>%
filter(include == "yes") %>%
summarize(
n = sum(!is.na(BEPA_RABS_log2_ratio)),
p_value = if (n >= 3) wilcox.test(BEPA_RABS_log2_ratio, mu = 0, alternative = "greater")$p.value else NA_real_,
.groups = "drop"
) %>%
mutate(
significance = case_when(
is.na(p_value) ~ NA_character_,
p_value <= 0.001 ~ "***",
p_value <= 0.01 ~ "**",
p_value <= 0.05 ~ "*",
TRUE ~ "ns"
)
)
# Add y-label positions for plotting significance results
label_positions <- df_ase %>%
filter(include == "yes") %>%
group_by(kmer_pair,Temperature, source, Tissue_type) %>%
summarize(y_pos = max(BEPA_RABS_log2_ratio, na.rm = TRUE) + 0.3, .groups = "drop")
test_results <- left_join(test_results, label_positions,
by = c("kmer_pair", "Temperature", "source", "Tissue_type"))
# Plot with significance values
df_ase %>%
filter(include == "yes") %>%
ggplot(aes(reorder(kmer_pair, min), BEPA_RABS_log2_ratio, color = Temperature)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=1, dodge.width = .7, alpha = 0.7, size=2) +
scale_color_manual(values = c("black", "grey60")) +
facet_grid(cols = vars(source), rows = vars(Tissue_type), scales = "free_x", space = "free") +
geom_text(
data = test_results %>% filter(!is.na(significance)),
aes(x = kmer_pair, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust=0.75) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylim(-1.5, 3.1)

# Collapse data for each kmer into mean across kmers per condition
means <- df_ase %>%
filter(include == "yes") %>%
# filter(include == "yes", source == "C3" | (source=="C2" & Tissue_type == "Whole juvenile")) %>%
group_by(source, Tissue_type, Temperature, sample) %>%
summarize(n = n(),
mean_BEPA_RABS_log2_ratio = mean(BEPA_RABS_log2_ratio),
geom_mean_prop_BEPA = exp(mean(log(prop_BEPA))),
mean_prop_BEPA = mean(prop_BEPA),
BEPA_total = sum(BEPA),
RABS_total = sum(RABS),
total_prop_BEPA = BEPA_total/ (BEPA_total + RABS_total)) %>%
filter((source == "C2" & n==4)|(source=="C3" & n==9)) # Require all kmers to be called
## `summarise()` has grouped output by 'source', 'Tissue_type', 'Temperature'. You
## can override using the `.groups` argument.
means
# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results_mean <- means %>%
group_by(Temperature, source, Tissue_type) %>%
summarize(
n = sum(!is.na(mean_prop_BEPA)),
p_value = if (n >= 3) wilcox.test(mean_prop_BEPA, mu = 0.5)$p.value else NA_real_,
.groups = "drop"
) %>%
mutate(
significance = case_when(
is.na(p_value) ~ NA_character_,
p_value <= 0.001 ~ "***",
p_value <= 0.01 ~ "**",
p_value <= 0.05 ~ "*",
TRUE ~ "ns"
)
)
# Add y-label positions for plotting significance results
label_positions_means <- means %>%
group_by(Temperature, source, Tissue_type) %>%
summarize(y_pos = max(mean_prop_BEPA, na.rm = TRUE) + 0.015, .groups = "drop")
test_results_means <- left_join(test_results_mean, label_positions_means,
by = c("Temperature", "source", "Tissue_type"))
# Plot collapsed ASE results using arithmetic mean of proportion BEPA
ase_means_temp_plot <- means %>%
ggplot(aes(Tissue_type, mean_prop_BEPA, color = Temperature)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=2) +
scale_color_manual(values = c("black", "grey60")) +
scale_y_continuous(labels = percent, limits=c(0.45, 0.75)) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
geom_text(
data = test_results_means %>% filter(!is.na(significance)),
aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5) +
theme_cowplot(7) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
theme(legend.position = "top") +
# ylim(0.45, 0.75) +
xlab("Tissue") +
ylab("Percent BEPA allele expression")
ase_means_temp_plot

# Plot collapsed ASE results using arithmetic mean of kmers
ase_means_plot <- means %>%
filter(Temperature == "18C") %>%
ggplot(aes(Tissue_type, mean_prop_BEPA, color = source, fill=source)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=2, shape=21) +
scale_fill_manual(values = c("#f0e442", "#009e73")) +
scale_color_manual(values = c("#c6ba10", "#009e73")) +
scale_y_continuous(labels = percent) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
geom_text(
data = test_results_means %>% filter(!is.na(significance), Temperature == "18C"),
aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
theme_cowplot(7) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
theme(legend.position = "none") +
xlab("Tissue") +
ylab("Percent BEPA allele expression")
ase_means_plot

# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results_mean <- means %>%
group_by(Temperature, source, Tissue_type) %>%
summarize(
n = sum(!is.na(geom_mean_prop_BEPA)),
p_value = if (n >= 3) wilcox.test(geom_mean_prop_BEPA, mu = 0.5)$p.value else NA_real_,
.groups = "drop"
) %>%
mutate(
significance = case_when(
is.na(p_value) ~ NA_character_,
p_value <= 0.001 ~ "***",
p_value <= 0.01 ~ "**",
p_value <= 0.05 ~ "*",
TRUE ~ "ns"
)
)
# Add y-label positions for plotting significance results
label_positions_means <- means %>%
group_by(Temperature, source, Tissue_type) %>%
summarize(y_pos = max(geom_mean_prop_BEPA, na.rm = TRUE) + 0.015, .groups = "drop")
test_results_means <- left_join(test_results_mean, label_positions_means,
by = c("Temperature", "source", "Tissue_type"))
# Plot collapsed ASE results using geometric mean of kmers
means %>%
ggplot(aes(Tissue_type, geom_mean_prop_BEPA, color = Temperature)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
scale_color_manual(values = c("black", "grey60")) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
geom_text(
data = test_results_means %>% filter(!is.na(significance)),
aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
# ylim(0.18,0.82) +
xlab("Tissue type") +
ylab("Proportion of expression from BEPA allele")

# Plot collapsed ASE results using geometric mean of kmers
means %>%
filter(Temperature == "18C") %>%
ggplot(aes(Tissue_type, geom_mean_prop_BEPA, color = source, fill=source)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3, shape=21) +
scale_fill_manual(values = c("#f0e442", "#009e73")) +
scale_color_manual(values = c("#c6ba10", "#009e73")) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
geom_text(
data = test_results_means %>% filter(!is.na(significance), Temperature == "18C"),
aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
theme_cowplot(6) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
theme(legend.position = "none") +
# ylim(0.18,0.82) +
xlab("Tissue type") +
ylab("Proportion of expression from BEPA allele")

# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results_mean <- means %>%
group_by(Temperature, source, Tissue_type) %>%
summarize(
n = sum(!is.na(mean_BEPA_RABS_log2_ratio)),
p_value = if (n >= 3) wilcox.test(mean_BEPA_RABS_log2_ratio, mu = 0)$p.value else NA_real_,
.groups = "drop"
) %>%
mutate(
significance = case_when(
is.na(p_value) ~ NA_character_,
p_value <= 0.001 ~ "***",
p_value <= 0.01 ~ "**",
p_value <= 0.05 ~ "*",
TRUE ~ "ns"
)
)
# Add y-label positions for plotting significance results
label_positions_means <- means %>%
group_by(Temperature, source, Tissue_type) %>%
# summarize(y_pos = max(mean_BEPA_RABS_log2_ratio, na.rm = TRUE) + 0.03, .groups = "drop")
summarize(y_pos = max(mean_BEPA_RABS_log2_ratio, na.rm = TRUE) + 0.1, .groups = "drop")
test_results_means <- left_join(test_results_mean, label_positions_means,
by = c("Temperature", "source", "Tissue_type"))
# Plot collapsed ASE results
means %>%
ggplot(aes(Tissue_type, mean_BEPA_RABS_log2_ratio, color = Temperature)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
scale_color_manual(values = c("black", "grey60")) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
geom_text(
data = test_results_means %>% filter(!is.na(significance)),
aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
# ylim(0.18,0.82) +
# ylim(-2.1, 2.1) +
xlab("Tissue type") +
ylab("log2(BEPA/RABS)")

# Plot collapsed ASE results for just 18C
means %>%
filter(Temperature == "18C") %>%
ggplot(aes(Tissue_type, mean_BEPA_RABS_log2_ratio, color = source, fill=source)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3, shape=21) +
scale_fill_manual(values = c("#f0e442", "#009e73")) +
scale_color_manual(values = c("#c6ba10", "#009e73")) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
geom_text(
data = test_results_means %>% filter(!is.na(significance), Temperature =="18C"),
aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
position = position_dodge(width = 0.8),
inherit.aes = FALSE,
size = 4) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
# ylim(0.18,0.82) +
ylim(-0.5, 1.5) +
xlab("Tissue type") +
ylab("log2(BEPA/RABS)")

comps <- list(c("Whole juvenile", "Adult flank"), c("Adult flank", "Adult abductor"), c("Adult abductor", "Adult adductor"),
c("Whole juvenile", "Adult abductor"), c("Whole juvenile", "Adult adductor"), c("Adult flank", "Adult adductor"))
# Plot just 18C
means %>%
filter(Temperature == "18C") %>%
ggplot(aes(Tissue_type, mean_prop_BEPA)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5, comparisons = comps) +
facet_grid(cols = vars(source), scales = "free_x", space = "free")
## Warning: Computation failed in `stat_signif()`.
## Caused by error in `wilcox.test.default()`:
## ! not enough 'y' observations

# Plot just 18C
means %>%
filter(Temperature == "18C", Tissue_type == "Whole juvenile") %>%
ggplot(aes(source, mean_prop_BEPA)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_boxplot(fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
stat_compare_means(method = "wilcox.test", vjust = 0.5)

# Plot total counts normalized to number of reads sequenced in each library
df_ase %>%
ggplot(aes(kmer_pair, RPM, color= Temperature)) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
scale_color_manual(values = c("black", "grey60")) +
facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
stat_compare_means(method = "wilcox.test", label = "p.signif") +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Plot total counts (log10 scale) normalized to number of reads sequenced in each library
total_expression <- df_ase %>%
ggplot(aes(kmer_pair, log10(RPM+1), color= Temperature)) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
scale_color_manual(values = c("black", "grey60")) +
facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.75) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylim(0, 4)
total_expression

# Save ASE figure to separate file
ggsave(
"/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/totalexpressionASE_log10RPM_plot.pdf",
plot = total_expression,
scale = 1,
width = 12,
height = 10,
units = c("in"),
dpi = 300,
)
total_counts_means <- df_ase %>%
group_by(source, Tissue_type, Temperature, sample) %>%
summarize(mean_log10RPM = mean(log10(RPM+1)))
## `summarise()` has grouped output by 'source', 'Tissue_type', 'Temperature'. You
## can override using the `.groups` argument.
total_counts_means %>%
ggplot(aes(Tissue_type, mean_log10RPM, color = Temperature)) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
scale_color_manual(values = c("black", "grey60")) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.25) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
xlab("Tissue type") +
ylab("log10(RPM+1)")

Total count analysis from myosin copy expression (MCE) kmers
# Read kmer counts file and get MCE kmers
df_mce <- kmer_counts %>%
filter(group == "MCE", use=="yes", spans_intron=="no") %>%
separate(kmer_name, into=c("kmer_group"), remove=FALSE, extra="drop", sep="_") %>%
left_join(., m, by=c("sample" = "ID")) %>%
left_join(., s, by=c("sample" = "Sample ID")) %>%
mutate(RPM = (match_count/ `# Reads`)*1000000) # Table contains read pairs
# Plot total counts (log10 scale) normalized to number of reads sequenced in each library
mce_plot <- df_mce %>%
ggplot(aes(kmer_group, log10(RPM+1), color= Temperature)) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
scale_color_manual(values = c("black", "grey60")) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylim(0, 3.8)
mce_plot

# Plot total counts (log10 scale) normalized to number of reads sequenced in each library
df_mce %>%
filter(Temperature == "18C") %>%
ggplot(aes(kmer_group, log10(RPM+1), color= source, fill=source)) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2, shape=21) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
scale_fill_manual(values = c("#e69f00", "#f0e442", "#009e73")) +
scale_color_manual(values = c("#e69f00", "#c6ba10", "#009e73")) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylim(0, 3.8)

# Average kmers
mce_means <- df_mce %>%
group_by(source, Tissue_type, Temperature, sample) %>%
summarize(mean_log10RPM = mean(log10(RPM+1)))
## `summarise()` has grouped output by 'source', 'Tissue_type', 'Temperature'. You
## can override using the `.groups` argument.
mce_means_temp_plot <- mce_means %>%
ggplot(aes(Tissue_type, mean_log10RPM, color = Temperature)) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.25) +
scale_color_manual(values = c("black", "grey60")) +
theme_cowplot(7) +
theme(legend.position = "top") +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
ylim(0,3.1) +
xlab("Tissue") +
ylab("Total expression, log10(RPM+1)")
mce_means_temp_plot

mce_means %>%
filter(Temperature == "18C") %>%
ggplot(aes(Tissue_type, mean_log10RPM, fill = source, color=source)) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.7, size=3, shape=21) +
# facet_grid(cols = vars(source), scales = "free_x", space = "free") +
# stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.25) +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
scale_fill_manual(values = c("#e69f00", "#f0e442", "#009e73")) +
scale_color_manual(values = c("#e69f00", "#c6ba10", "#009e73")) +
scale_x_discrete(labels = label_wrap(10)) +
ylim(0,3) +
xlab("Tissue type") +
ylab("log10(RPM+1)")

mce_means_plot <- mce_means %>%
filter(Temperature == "18C") %>%
ggplot(aes(Tissue_type, mean_log10RPM, color=source, fill=source)) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.7, size=2, shape=21) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
scale_fill_manual(values = c("#e69f00", "#f0e442", "#009e73")) +
scale_color_manual(values = c("#e69f00", "#c6ba10", "#009e73")) +
theme_cowplot(7) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
theme(legend.position = "none") +
ylim(0,3) +
xlab("Tissue") +
ylab("Total expression, log10(RPM+1)")
mce_means_plot

mce_means %>%
filter(Temperature == "18C") %>%
ggplot(aes(Tissue_type, mean_log10RPM)) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=3) +
facet_grid(cols = vars(source), scales = "free_x", space = "free") +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
scale_x_discrete(labels = label_wrap(10)) +
ylim(0,3) +
xlab("Tissue type") +
ylab("log10(RPM+1)")

# # Get count ratios for kmer that can distingish BEPA copies as well as RABS
# df_wide_C3_copies <- df %>%
# filter(kmer %in% c("CTAGAGGGAGCAATGAAAGAGGCTCGTTCT", "CTAGAGGGAGCATTGAAAGAGGCTCGTTCT", "CTTGAGGGAGCATTGAAGGAGGCTCGTTCT")) %>%
# select(source, exon, sample, allele, total_ASEcounts) %>%
# pivot_wider(id_cols = c(source, exon, sample), names_from = allele, values_from = total_ASEcounts) %>%
# mutate(total_counts = `BEPA-ab` + `BEPA-c` + RABS) %>%
# mutate(BEPA_RABS_ratio = (`BEPA-ab` + `BEPA-c`) / RABS)
#
# df_wide_C3_copies
# df %>%
# filter(kmer %in% c("CTAGAGGGAGCAATGAAAGAGGCTCGTTCT", "CTAGAGGGAGCATTGAAAGAGGCTCGTTCT", "CTTGAGGGAGCATTGAAGGAGGCTCGTTCT")) %>%
# left_join(., m, by=c("sample" = "ID")) %>%
# ggplot(aes(sample, total_ASEcounts, fill=factor(allele, levels=c("RABS", "BEPA-ab", "BEPA-c")))) +
# geom_bar(position="fill", stat="identity") +
# scale_fill_manual(values = c("#D16D6A", "#4B77D1", "#CCDAF5"), name = "C3 copy") +
# geom_hline(yintercept = 0.5, linetype = "dashed") +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
# ylab("Proportion of expression") +
# facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free")
Kinnex data analysis
k <- read_tsv(kinnex_counts_file) %>%
separate(sample, into=c("ID"), sep="_") %>%
separate(name, into=c("allele", "MYH_copy"), sep="_", remove=FALSE) %>%
mutate(allele=case_when(MYH_copy=="C3A" ~ "BEPA-C3A",
MYH_copy=="C3B" ~ "BEPA-C3B",
MYH_copy=="C3L" ~ "BEPA-C3L",
MYH_copy=="C3" ~ "RABS-C3",
TRUE ~ allele),
allele = factor(allele, levels=c("RABS", "BEPA", "RABS-C3", "BEPA-C3A", "BEPA-C3B", "BEPA-C3L"))) %>%
left_join(., m)
## Rows: 128 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): name, chr, sample
## dbl (3): start, stop, read_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 1 pieces. Additional pieces discarded in 128 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Warning: Expected 2 pieces. Additional pieces discarded in 128 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Joining with `by = join_by(ID)`
k %>%
filter(str_detect(MYH_copy, "C3")) %>%
ggplot(aes(ID, read_count, fill=allele)) +
geom_bar(position="fill", stat="identity", color="black") +
scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3 copy") +
geom_hline(yintercept = 0.5, linetype = "dashed") +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab("Proportion of expression") +
facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free")

k_C3_temp_plot <- k %>%
filter(str_detect(MYH_copy, "C3")) %>%
ggplot(aes(ID, read_count, fill=allele)) +
geom_bar(position="fill", stat="identity", color="black") +
scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3 copy") +
# scale_fill_manual(values = c("#D16D6A", "#4B77D1", "#789DE6","#CCDAF5"), name = "MYH3 copy") +
scale_y_continuous(expand = expansion(mult = c(0, 0)),
breaks = seq(0, 1, by = 0.1),
labels = percent) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
theme_cowplot(7) +
# panel_border(color="black", size=0.75) +
theme(axis.text.x = element_blank(),
axis.ticks.x=element_blank(),
strip.background = element_rect(colour=NA, fill=NA),
strip.placement = "outside",
legend.position = "none") +
ylab("Percent of total C3 expression") +
xlab("Tissue") +
facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free",
labeller = label_wrap_gen(width = 2, multi_line = TRUE),
switch = "x")
k_C3_temp_plot

k_C3_plot <- k %>%
filter(str_detect(MYH_copy, "C3"), Temperature=="18C") %>%
ggplot(aes(ID, read_count, fill=allele)) +
geom_bar(position="fill", stat="identity", color="black") +
scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3 copy") +
# scale_fill_manual(values = c("#D16D6A", "#4B77D1", "#789DE6","#CCDAF5"), name = "MYH3 copy") +
scale_y_continuous(expand = expansion(mult = c(0, 0)),
breaks = seq(0, 1, by = 0.1),
labels = percent) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
theme_cowplot(7) +
# panel_border(color="black", size=0.75) +
theme(axis.text.x = element_blank(),
axis.ticks.x=element_blank(),
strip.background = element_rect(colour=NA, fill=NA),
strip.placement = "outside",
legend.position = "none") +
ylab("Percent of total C3 expression") +
xlab("Tissue") +
facet_grid(cols=vars(Tissue_type), scales = "free_x", space= "free",
labeller = label_wrap_gen(width = 2, multi_line = TRUE),
switch = "x")
k_C3_plot

k %>%
filter(str_detect(MYH_copy, "C2")) %>%
ggplot(aes(ID, read_count, fill=allele)) +
geom_bar(position="fill", stat="identity", color="black") +
scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3C2 copy") +
scale_y_continuous(expand = expansion(mult = c(0, 0))) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab("Proportion of expression") +
facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free")

k %>%
filter(str_detect(MYH_copy, "C2"), Temperature=="18C", Tissue_type=="Whole juvenile") %>%
ggplot(aes(ID, read_count, fill=allele)) +
geom_bar(position="fill", stat="identity", color="black") +
scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3C2 copy") +
scale_y_continuous(expand = expansion(mult = c(0, 0)),
breaks = seq(0, 1, by = 0.1)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
theme_cowplot(10) +
panel_border(color="black", size=0.75) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab("Proportion of expression") +
facet_grid(cols=vars(Tissue_type), scales = "free_x", space= "free")

Combine into final plot
row1 <- plot_grid(mce_means_plot, labels = c('B'), nrow=1)
d_panel <- plot_grid(NULL, k_C3_plot, ncol=1, rel_heights = c(0.2, 1))
row2 <- plot_grid(ase_means_plot, d_panel, labels = c('C','D'), nrow=1, rel_widths = c(1, 1))
final_plot <- plot_grid(row1, row2, label_size = 12, nrow=2)
final_plot

# Save final figure to separate file
ggsave(
"/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/expression_plot_v2.pdf",
plot = final_plot,
width = 4.5,
height = 4.5,
units = c("in"),
dpi = 300,
)
# Make supplemental figure
row1 <- plot_grid(mce_means_temp_plot, ase_means_temp_plot, labels = c('A', 'B'), nrow=1, rel_widths = c(2, 1))
row2 <- plot_grid(k_C3_temp_plot,NULL, labels = c('C'), nrow=1, rel_widths = c(3, 1.25))
final_plot <- plot_grid(row1, row2, label_size = 12, nrow=2, rel_heights = c(1.2, 1))
final_plot

# Save final figure to separate file
ggsave(
"/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/temp_expression_supp_figure.pdf",
plot = final_plot,
width = 6.5,
height = 6,
units = c("in"),
dpi = 300,
)
# Make final kmer table
k